TIPS
Write commands in code editor of R Studio and run them using icon
Run in R Studio.
We’ll download files gene counts per sample for an example RNA-seq project, and combine them into a single counts table. We’ll also prepare a metadata file for this project.
First, download the table of zip counts at this URL:
https://wd.cri.uic.edu/edgeR/featurecounts.zip
And double-click to unzip it. There should be 6 files, which are from the featureCounts program. Each has gene expression counts for one sample. Here is a preview of the first one (note that long fields are truncated):
# Program:featureCounts v2.0.3; Command:"featureCounts" "-t" "exon" "-g" "gene_id" "-p"
"-T" "1" "-s" "1" "-a" "mm9.gtf" "-o" "A.1" "A.1.bam"
Geneid Chr Start End Strand Length A.1.bam
Xkr4 chr1;chr... 3204563;... 3207049;... -;-;- 3634 0
Rp1 chr1;chr... 4280927;... 4283093;... -;-;-;-;... 9747 7
Sox17 chr1;chr... 4481009;... 4482749;... -;-;-;-;... 3130 229
Mrpl15 chr1;chr... 4763279;... 4764597;... -;-;-;-;... 4203 80
Lypla1 chr1;chr... 4797974;... 4798063;... +;+;+;+;... 2433 184
Tcea1 chr1;chr... 4847775;... 4848057;... +;+;+;+;... 2847 108
Rgs20 chr1;chr... 4899657;... 4900743;... -;-;-;-;... 2241 0
Atp6v1h chr1;chr... 5073254;... 5073359;... +;+;+;+;... 1976 233
There’s a comment line (first line starting with #) giving the command used to generate this file, and after that the fields in the table give the gene ID, all exon coordinates and strands (Chr, Start, End, and Strand), total gene length in bp, and read counts for sample A.1. The read counts (last column) is the data we want for each sample, along with the gene IDs.
What we need to do is:
The steps below assume that the genes are in the same order in each file (we use cbind() without checking that the gene names match). In practice this is true for featureCounts files, but it’s also something that’s good to double-check in general.
# Make a list of all of the files ending in "counts.txt".
# This assumes you've downloaded all of the files to your "Downloads" folder
# and unpacked them there.
# full.names prints the entire directory structure, which we need to read them in.
fc.files <- list.files("~/Downloads/featurecounts",
pattern="*.counts.txt",full.names=T)
# note that you file paths would reflect where the files are on YOUR computer
fc.files
[1] "/Users/mmaiensc/Downloads/featurecounts/A.1.counts.txt"
[2] "/Users/mmaiensc/Downloads/featurecounts/A.2.counts.txt"
[3] "/Users/mmaiensc/Downloads/featurecounts/A.3.counts.txt"
[4] "/Users/mmaiensc/Downloads/featurecounts/B.1.counts.txt"
[5] "/Users/mmaiensc/Downloads/featurecounts/B.2.counts.txt"
[6] "/Users/mmaiensc/Downloads/featurecounts/B.3.counts.txt"
# Read the files into R
# First read in the first file
# NOTE:
# - R ignores comment lines (#), so the first line of the file will be skipped
# - We're subsetting the data frame to just column 6 (gene counts)
# This was column 7, but we set the first column as row names
# - We set drop=F to keep it structured as a data frame (otherwise would become a vector)
counts.table <- read.table(fc.files[1],sep="\t",header=T,row.names=1)[,6,drop=F]
head(counts.table)
# Then read in the rest of the files and combine them
# the selection [-1] drops the first item from the list
for(f in fc.files[-1]){
# here we read in the file and combine it with counts.table in one step
counts.table <- cbind(counts.table, read.table(f,sep="\t",header=T,row.names=1)[,6,drop=F])
}
head(counts.table)
# Remove the ".bam" suffixes from the column names
colnames(counts.table) <- gsub(".bam$","",colnames(counts.table))
head(counts.table)
# now write it to a file
write.table(counts.table,"combined_counts.txt",sep="\t",row.names=T,col.names=NA,quote=F)
You now have a counts table.
To make the metadata file, open up Microsoft Excel and create a file as follows:
Then choose Save As > Tab
delimited Text (.txt). You now have a metadata table.
ABOUT: These data are from an RNA-seq experiment with two experimental groups, A and B. There are three replicates per group.
library(edgeR)
## Loading required package: limma
# read in counts
data <- read.table("https://wd.cri.uic.edu/edgeR/pairwise_counts.txt",
header=T,row.names=1)
dim(data)
## [1] 23420 6
head(data)
# read in metadata table
metadata <- read.table("https://wd.cri.uic.edu/edgeR/pairwise_metadata.txt",
header=T,row.names=1)
metadata
# make sure the metadata row order matches the data column order
data <- data[,rownames(metadata)]
# check the data again to make sure we didn't drop any columns!
dim(data)
## [1] 23420 6
head(data)
# subset counts to genes with >20 total counts
data_subset <- data[rowSums(data)>20,]
dim(data_subset)
## [1] 14193 6
# create edgeR object
genes <- DGEList(counts=data_subset, group=metadata[,1])
genes
## An object of class "DGEList"
## $counts
## A.1 A.2 A.3 B.1 B.2 B.3
## Rp1 7 18 28 18 8 9
## Sox17 229 451 234 367 456 436
## Mrpl15 80 130 104 157 231 207
## Lypla1 184 215 189 292 344 377
## Tcea1 108 192 148 175 208 219
## 14188 more rows ...
##
## $samples
## group lib.size norm.factors
## A.1 A 3326193 1
## A.2 A 5032893 1
## A.3 A 5352456 1
## B.1 B 4979670 1
## B.2 B 5620129 1
## B.3 B 5886895 1
# calculate TMM factors
genes <- calcNormFactors(genes)
genes
## An object of class "DGEList"
## $counts
## A.1 A.2 A.3 B.1 B.2 B.3
## Rp1 7 18 28 18 8 9
## Sox17 229 451 234 367 456 436
## Mrpl15 80 130 104 157 231 207
## Lypla1 184 215 189 292 344 377
## Tcea1 108 192 148 175 208 219
## 14188 more rows ...
##
## $samples
## group lib.size norm.factors
## A.1 A 3326193 1.0236147
## A.2 A 5032893 1.0200193
## A.3 A 5352456 0.8468997
## B.1 B 4979670 1.1083495
## B.2 B 5620129 0.9269172
## B.3 B 5886895 1.1007924
# estimate dispersion
genes <- estimateDisp(genes)
## Using classic mode.
genes
## An object of class "DGEList"
## $counts
## A.1 A.2 A.3 B.1 B.2 B.3
## Rp1 7 18 28 18 8 9
## Sox17 229 451 234 367 456 436
## Mrpl15 80 130 104 157 231 207
## Lypla1 184 215 189 292 344 377
## Tcea1 108 192 148 175 208 219
## 14188 more rows ...
##
## $samples
## group lib.size norm.factors
## A.1 A 3326193 1.0236147
## A.2 A 5032893 1.0200193
## A.3 A 5352456 0.8468997
## B.1 B 4979670 1.1083495
## B.2 B 5620129 0.9269172
## B.3 B 5886895 1.1007924
##
## $common.dispersion
## [1] 0.07726973
##
## $trended.dispersion
## [1] 0.15541353 0.05724855 0.06490358 0.05832588 0.06245742
## 14188 more elements ...
##
## $tagwise.dispersion
## [1] 0.17099271 0.05072614 0.04405357 0.03575971 0.03376216
## 14188 more elements ...
##
## $AveLogCPM
## [1] 1.746894 6.165060 4.900612 5.724314 5.127011
## 14188 more elements ...
##
## $trend.method
## [1] "locfit"
##
## $prior.df
## [1] 4.134752
##
## $prior.n
## [1] 1.033688
##
## $span
## [1] 0.2945153
# calculate BCV from the common dispersion
sqrt(genes$common.dispersion)
## [1] 0.2779743
plotBCV(genes)
# run differential
stats <- exactTest(genes)
# NOTE: logFC will be computed as B/A: positive means higher in B
# edgeR calculates this as the second group over the first
# group from exactTest
stats
## An object of class "DGEExact"
## $table
## logFC logCPM PValue
## Rp1 -0.93760389 1.746894 0.12038547
## Sox17 0.09658678 6.165060 0.72540551
## Mrpl15 0.54242240 4.900612 0.04323394
## Lypla1 0.36738965 5.724314 0.12314337
## Tcea1 0.04750527 5.127011 0.84813699
## 14188 more rows ...
##
## $comparison
## [1] "A" "B"
##
## $genes
## NULL
# run FDR correction
stats$table$QValue <- p.adjust(stats$table$PValue,method="BH")
head(stats$table)
sum(stats$table$QValue<0.05)
## [1] 684
# calculate normalized expression
norm <- cpm(genes)
head(norm)
## A.1 A.2 A.3 B.1 B.2 B.3
## Rp1 2.055957 3.506278 6.176934 3.261333 1.535687 1.388835
## Sox17 67.259172 87.851756 51.621518 66.494965 87.534172 67.281361
## Mrpl15 23.496654 25.323122 22.942897 28.446075 44.342969 31.943215
## Lypla1 54.042304 41.880549 41.694303 52.906076 66.034551 58.176773
## Tcea1 31.720483 37.400304 32.649507 31.707408 39.927868 33.794996
## Atp6v1h 68.434005 85.903823 81.623768 79.359114 121.895174 72.682388
# log-scale the data first (this is good practice, as gene expression varies
# over several orders of magnitude)
# add a pseudo-count (0.1) to avoid taking the log of 0
norm.log <- log2(norm + 0.1)
# run PCA on the transpose of cpm.log: we want to plot the SAMPLES
pca <- prcomp(t(norm.log))
# make colors for our plot
pca.colors <- c(rep("Red",3),rep("Blue",3))
plot(pca$x[,1],pca$x[,2],col=pca.colors,xlab="PC1",ylab="PC2",pch=16)
# add sample name labels to plot:
# pos=4 makes them left-justified
# xpd=NA lets the labels go off of the plot area
text(pca$x[,1],pca$x[,2],colnames(norm),pos=4,xpd=NA)
legend('topright',legend=c("Group A","Group B"),col=c("Red","Blue"),pch=16)
# check percent variation per PC
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 58.6293 31.5442 27.0428 24.24291 19.72479 6.857e-14
## Proportion of Variance 0.5598 0.1620 0.1191 0.09571 0.06336 0.000e+00
## Cumulative Proportion 0.5598 0.7218 0.8409 0.93664 1.00000 1.000e+00
# this is the vector of % variance per PC
summary(pca)$importance[2,]
## PC1 PC2 PC3 PC4 PC5 PC6
## 0.55979 0.16204 0.11910 0.09571 0.06336 0.00000
screeplot(pca)
# add the % variation to the axis labels
xlabel <- paste("PC1 (", 100*summary(pca)$importance[2,1], "%)",sep="")
ylabel <- paste("PC2 (", 100*summary(pca)$importance[2,2], "%)",sep="")
plot(pca$x[,1],pca$x[,2],col=pca.colors,xlab=xlabel,ylab=ylabel,pch=16)
text(pca$x[,1],pca$x[,2],colnames(norm),pos=4,xpd=NA)
legend('topright',legend=c("Group A","Group B"),col=c("Red","Blue"),pch=16)
We can also make the plot with ggplot, which is a bit easier. For later exercises we will plot with ggplot2.
# combine coordinates with metadata and sample names
pca.data <- cbind(pca$x,metadata,Sample=rownames(metadata))
head(pca.data)
library(ggplot2)
# geom_label adds the sample labels; hjust=0 and vjust=0 makes it left-justified
# coord_cartesian(clip='off') lets the labels go outside the plot boundary
ggplot(pca.data,aes(x=PC1,y=PC2,label=Sample,color=Group)) +
geom_point() + geom_text(hjust=0, vjust=0, show.legend=F, color="black") +
coord_cartesian(clip='off') + labs(x=xlabel, y=ylabel)
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.18.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library(circlize)
## ========================================
## circlize version 0.4.16
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
##
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
## in R. Bioinformatics 2014.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(circlize))
## ========================================
# Get a list of the differentially expressed genes, using a more stringent Q-value
degs.diff <- stats$table[stats$table$QValue < 0.05,]
dim(degs.diff)
## [1] 684 4
# subset the log-scaled CPMs to get the same genes
degs.norm <- norm.log[rownames(degs.diff),]
# we want to z-score across the genes, to account for gene-to-gene
# differences in baseline expression
# need to transpose twice, as the scale function works on columns
degs.norm.z <- t(scale(t(degs.norm)))
# scale the colors from blue to white to red, with max/min at +/-2 at the middle at 0
col_fun <- colorRamp2(c(-2,0,2),c("blue","white","red"))
# basic heatmap, turning off the row names
Heatmap(degs.norm.z,show_row_names=F,name="Z-scored log2 CPM",col=col_fun)
Notice that sample B.2 is a little bit “different” from the other B samples, with more extreme colors. Sample B.2 is also the more “different” one in the PCA plot. Overall, B.2 is more different from the group A samples than the other B samples, which we can see in both the PCA plot and the heatmap.
How can we get the genes that correspond to the heatmap?
# use the row_split option to get 2 clusters
# save as an object also; draw() will also plot it for us
ht <- draw(Heatmap(degs.norm.z,show_row_names=F,name="Z-scored log2 CPM",
col=col_fun,row_split=2))
# get the order of genes in the heatmap
genes_order <- row_order(ht)
NOTE: genes_order is a list of 2 vectors. Each vector is
the order (indices) of the genes for each row cluster. To get the actual
gene names for each cluster, we would need to select the names from the
degs.norm.z based on the indices in the vectors.
# get gene names for the first cluster
cluster1_genes <- rownames(degs.norm.z)[ genes_order[[1]] ]
head(cluster1_genes)
## [1] "Eif2s3y" "Fbn2" "Rbm20" "Cst8" "Kcnj2" "Gtpbp2"
# use an apply statement to get the genes from all clusters
gene_clusters <- sapply(genes_order, function(x) rownames(degs.norm.z)[x])
# gene_clusters will also be a list
head(gene_clusters[[1]])
## [1] "Eif2s3y" "Fbn2" "Rbm20" "Cst8" "Kcnj2" "Gtpbp2"
head(gene_clusters[[2]])
## [1] "Ptafr" "Fkbp10" "Socs3" "Lsm12" "Cd200r1" "Wdr46"
You can save a really tall heatmap (with gene names turned ON) to a PDF to double-check the gene orders.
# make a really long PDF and inspect the gene order manually
pdf("pairwise_long_heatmap.pdf",height=100)
Heatmap(degs.norm.z,show_row_names=T,name="Z-scored log2 CPM",
col=col_fun,row_split=2)
dev.off()
ABOUT: These data are from an RNA-seq experiment comparing two different disease models (model1 and model2) to a healthy control group, so there are 3 experimental groups in total. There are 4 replicates per group.
# reload the edgeR library in case you closed R studio
library(edgeR)
# read in counts
data <- read.table("https://wd.cri.uic.edu/edgeR/anova_counts.txt",
header=T,row.names=1)
dim(data)
## [1] 24421 12
head(data)
# read in metadata table
metadata <- read.table("https://wd.cri.uic.edu/edgeR/anova_metadata.txt",
header=T,row.names=1)
metadata
# make sure the metadata row order matches the data column order
data <- data[,rownames(metadata)]
# check the data again to make sure we didn't drop any columns!
dim(data)
## [1] 24421 12
# subset counts to genes with >20 total counts
data_subset <- data[rowSums(data)>20,]
dim(data_subset)
## [1] 16713 12
# define our factors and model matrix
treat <- factor(metadata[,1])
treat
## [1] Control Control Control Control Model1 Model1 Model1 Model1 Model2
## [10] Model2 Model2 Model2
## Levels: Control Model1 Model2
model <- model.matrix(~treat)
model
## (Intercept) treatModel1 treatModel2
## 1 1 0 0
## 2 1 0 0
## 3 1 0 0
## 4 1 0 0
## 5 1 1 0
## 6 1 1 0
## 7 1 1 0
## 8 1 1 0
## 9 1 0 1
## 10 1 0 1
## 11 1 0 1
## 12 1 0 1
## attr(,"assign")
## [1] 0 1 1
## attr(,"contrasts")
## attr(,"contrasts")$treat
## [1] "contr.treatment"
# create edgeR object, without setting the groups this time
genes <- DGEList(counts=data_subset)
# calculate TMM factors
genes <- calcNormFactors(genes)
# estimate dispersion
genes <- estimateDisp(genes,model)
# calculate BCV from the common dispersion
sqrt(genes$common.dispersion)
## [1] 0.1719392
plotBCV(genes)
# fit the model
fit <- glmQLFit(genes,model)
# we are testing if either coefficient 2 or 3 has an effect
qlf <- glmQLFTest(fit, coef=2:3)
qlf
## An object of class "DGELRT"
## $coefficients
## (Intercept) treatModel1 treatModel2
## Xkr4 -12.905525 0.102388201 -0.10083802
## Sox17 -11.940529 -0.003795860 -0.09714796
## Mrpl15 -9.858317 -0.088182239 -0.11520116
## Lypla1 -10.181974 0.003604001 -0.16800056
## Tcea1 -9.953796 0.247642882 0.06200444
## 16708 more rows ...
##
## $fitted.values
## Control.1 Control.2 Control.3 Control.4 Model1.10 Model1.7 Model1.8
## Xkr4 28.1162 27.83997 31.44391 29.64327 56.91949 36.68507 38.20549
## Sox17 73.9728 73.24605 82.72790 77.99047 134.61683 86.76163 90.35748
## Mrpl15 594.1755 588.33800 664.49950 626.44687 993.77219 640.49417 667.03958
## Lypla1 429.8554 425.63228 480.73121 453.20209 788.06930 507.91701 528.96773
## Tcea1 540.0582 534.75244 603.97719 569.39037 1263.82181 814.54333 848.30224
## Model1.9 Model2.1 Model2.2 Model2.3 Model2.4
## Xkr4 34.97135 30.16443 25.02012 30.34803 28.77078
## Sox17 82.70862 79.67564 66.08758 80.16061 75.99450
## Mrpl15 610.57388 628.61220 521.40729 632.43842 599.56934
## Lypla1 484.18998 431.36985 357.80308 433.99550 411.43989
## Tcea1 776.49244 682.15466 565.81850 686.30678 650.63805
## 16708 more rows ...
##
## $deviance
## Xkr4 Sox17 Mrpl15 Lypla1 Tcea1
## 5.444725 5.365753 6.818389 6.925917 8.804578
## 16708 more elements ...
##
## $method
## [1] "oneway"
##
## $unshrunk.coefficients
## (Intercept) treatModel1 treatModel2
## Xkr4 -12.909326 0.102759575 -0.10124567
## Sox17 -11.941974 -0.003797212 -0.09729660
## Mrpl15 -9.858497 -0.088198944 -0.11522322
## Lypla1 -10.182223 0.003604917 -0.16804610
## Tcea1 -9.953995 0.247686359 0.06201635
## 16708 more rows ...
##
## $df.residual
## [1] 9 9 9 9 9
## 16708 more elements ...
##
## $design
## (Intercept) treatModel1 treatModel2
## 1 1 0 0
## 2 1 0 0
## 3 1 0 0
## 4 1 0 0
## 5 1 1 0
## 7 more rows ...
##
## $offset
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 16.24567 16.2358 16.35753 16.29856 16.8482 16.40894 16.44955 16.3611
## [,9] [,10] [,11] [,12]
## [1,] 16.41723 16.23025 16.4233 16.36993
## attr(,"class")
## [1] "CompressedMatrix"
## attr(,"Dims")
## [1] 5 12
## attr(,"repeat.row")
## [1] TRUE
## attr(,"repeat.col")
## [1] FALSE
## 16708 more rows ...
##
## $dispersion
## [1] 0.06742343 0.04528205 0.02207476 0.02321379 0.02182389
## 16708 more elements ...
##
## $prior.count
## [1] 0.125
##
## $AveLogCPM
## [1] 1.401471 2.691206 5.617022 5.173267 5.732831
## 16708 more elements ...
##
## $df.residual.zeros
## [1] 9 9 9 9 9
## 16708 more elements ...
##
## $df.prior
## [1] 8.589705
##
## $var.post
## Xkr4 Sox17 Mrpl15 Lypla1 Tcea1
## 0.7549259 0.7015419 0.7345577 0.7452369 0.8463764
## 16708 more elements ...
##
## $var.prior
## Xkr4 Sox17 Mrpl15 Lypla1 Tcea1
## 0.9120452 0.8119211 0.7104161 0.7197664 0.7081655
## 16708 more elements ...
##
## $samples
## group lib.size norm.factors
## Control.1 1 11112335 1.0223515
## Control.2 1 10765117 1.0449583
## Control.3 1 12414349 1.0234380
## Control.4 1 12877683 0.9301165
## Model1.10 1 21699160 0.9563989
## 7 more rows ...
##
## $table
## logFC.treatModel1 logFC.treatModel2 logCPM F PValue
## Xkr4 0.147714950 -0.1454785 1.401471 0.5676494 0.57690230
## Sox17 -0.005476268 -0.1401549 2.691206 0.2964354 0.74710527
## Mrpl15 -0.127220079 -0.1662001 5.617022 0.8403515 0.44816641
## Lypla1 0.005199475 -0.2423736 5.173267 1.9911074 0.16615373
## Tcea1 0.357273158 0.0894535 5.732831 3.4387713 0.05488631
## 16708 more rows ...
##
## $comparison
## [1] "treatModel1" "treatModel2"
##
## $df.test
## [1] 2 2 2 2 2
## 16708 more elements ...
##
## $df.total
## [1] 17.58971 17.58971 17.58971 17.58971 17.58971
## 16708 more elements ...
# do p-value adjustment
qlf$table$QValue <- p.adjust(qlf$table$PValue,method="BH")
head(qlf$table)
# check number of significant genes
sum(qlf$table$QValue<0.05)
## [1] 5710
# make heatmap of differentially expressed genes
# directly generate log-scaled gene expression this time
norm <- cpm(genes, log=T)
# subset based on degs
degs.diff <- qlf$table[qlf$table$QValue < 0.05,]
dim(degs.diff)
## [1] 5710 6
# subset the (already) log-scaled CPMs to get the same genes
degs.norm <- norm[rownames(degs.diff),]
# z-score across the genes
degs.norm.z <- t(scale(t(degs.norm)))
# basic heatmap
Heatmap(degs.norm.z,show_row_names=F,name="One-way ANOVA, Z-scored log2 CPM",
col=col_fun)
# make the model, this time without the intercept
model.alt <- model.matrix(~0+treat)
model.alt
## treatControl treatModel1 treatModel2
## 1 1 0 0
## 2 1 0 0
## 3 1 0 0
## 4 1 0 0
## 5 0 1 0
## 6 0 1 0
## 7 0 1 0
## 8 0 1 0
## 9 0 0 1
## 10 0 0 1
## 11 0 0 1
## 12 0 0 1
## attr(,"assign")
## [1] 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$treat
## [1] "contr.treatment"
# create a new edgeR object
genes.alt <- DGEList(counts=data_subset)
# calculate TMM factors
genes.alt <- calcNormFactors(genes.alt)
# estimate dispersion
genes.alt <- estimateDisp(genes.alt,model.alt)
# calculate BCV from the common dispersion
# should be the same as last time: these models are ultimately equivalent in terms of how
# they model the data, but differ in how easily we can ask different questions of them
sqrt(genes.alt$common.dispersion)
## [1] 0.1719392
# fit the model
fit.alt <- glmQLFit(genes.alt,model.alt)
# make a couple contrasts to try
# pair-wise comparison between model1 and model2
comp1 <- makeContrasts(treatModel1 - treatModel2, levels=model.alt)
# comparison of control vs average of model1 and model2
comp2 <- makeContrasts(treatControl - (treatModel1 + treatModel2)/2, levels=model.alt)
# do qlf tests for these comparisons
qlf1 <- glmQLFTest(fit.alt, contrast=comp1)
qlf2 <- glmQLFTest(fit.alt, contrast=comp2)
# do p-value adjustment for each
qlf1$table$QValue <- p.adjust(qlf1$table$PValue,method="BH")
qlf2$table$QValue <- p.adjust(qlf2$table$PValue,method="BH")
# make heatmap of the DEGs
# we can use the same normalized expression that we got above, already in log-scale
# subset based on degs
degs.diff1 <- qlf1$table[qlf1$table$QValue < 0.05,]
degs.diff2 <- qlf2$table[qlf2$table$QValue < 0.05,]
dim(degs.diff1)
## [1] 3688 5
dim(degs.diff2)
## [1] 4030 5
# how many genes are significant in both cases?
length(intersect(rownames(degs.diff1),rownames(degs.diff2)))
## [1] 1914
# next make heatmaps for DEGs for each factor
# subset the (already) log-scaled CPMs to get the same genes
degs.norm1 <- norm[rownames(degs.diff1),]
degs.norm2 <- norm[rownames(degs.diff2),]
# z-score across the genes
degs.norm.z1 <- t(scale(t(degs.norm1)))
degs.norm.z2 <- t(scale(t(degs.norm2)))
# heatmap 1: these are genes where model1 and model2 are different
Heatmap(degs.norm.z1,show_row_names=F,name="Contrast 1, Z-scored log2 CPM",
col=col_fun)
# heatmap 2: these are genes where the average of model1 and model2 is dfferent from control
Heatmap(degs.norm.z2,show_row_names=F,name="Contrast 2, Z-scored log2 CPM",
col=col_fun)
ABOUT: These are data from a mouse RNA-seq experiment where animals were subjected to a disease model and allowed to recover; we have disease conditions control (no disease), peak disease, and recovery. Two different nerve tissues were then collected to profile gene expression: the dorsal root ganglion (DRG) and the sciatic nerve. The goal is to look for similarities and differences in the tissue responses to the disease model.
# load the edgeR library, if you've restarted R
library(edgeR)
# read in counts
data <- read.table("https://wd.cri.uic.edu/edgeR/twofactor_counts.txt",
header=T,row.names=1)
dim(data)
## [1] 39056 18
head(data)
# read in metadata table
metadata <- read.table("https://wd.cri.uic.edu/edgeR/twofactor_metadata.txt",
header=T,row.names=1)
metadata
# make sure the metadata row order matches the data column order
data <- data[,rownames(metadata)]
# check the data again to make sure we didn't drop any columns!
dim(data)
## [1] 39056 18
# subset counts to genes with >20 total counts
data_subset <- data[rowSums(data)>20,]
dim(data_subset)
## [1] 20564 18
# define our factors and model matrix
disease <- factor(metadata[,1])
tissue <- factor(metadata[,2])
model <- model.matrix(~ disease + tissue + disease:tissue)
model
## (Intercept) diseasePeak_disease diseaseRecovery tissueSciatic_nerve
## 1 1 1 0 1
## 2 1 1 0 1
## 3 1 1 0 1
## 4 1 1 0 0
## 5 1 1 0 0
## 6 1 1 0 0
## 7 1 0 0 0
## 8 1 0 0 0
## 9 1 0 0 0
## 10 1 0 0 1
## 11 1 0 0 1
## 12 1 0 0 1
## 13 1 0 1 0
## 14 1 0 1 0
## 15 1 0 1 0
## 16 1 0 1 1
## 17 1 0 1 1
## 18 1 0 1 1
## diseasePeak_disease:tissueSciatic_nerve diseaseRecovery:tissueSciatic_nerve
## 1 1 0
## 2 1 0
## 3 1 0
## 4 0 0
## 5 0 0
## 6 0 0
## 7 0 0
## 8 0 0
## 9 0 0
## 10 0 0
## 11 0 0
## 12 0 0
## 13 0 0
## 14 0 0
## 15 0 0
## 16 0 1
## 17 0 1
## 18 0 1
## attr(,"assign")
## [1] 0 1 1 2 3 3
## attr(,"contrasts")
## attr(,"contrasts")$disease
## [1] "contr.treatment"
##
## attr(,"contrasts")$tissue
## [1] "contr.treatment"
# create edgeR object
genes <- DGEList(counts=data_subset)
# calculate TMM factors
genes <- calcNormFactors(genes)
# estimate dispersion
genes <- estimateDisp(genes,model)
# calculate BCV from the common dispersion
sqrt(genes$common.dispersion)
## [1] 0.1542001
plotBCV(genes)
# fit the model
fit <- glmQLFit(genes,model)
# now we test each factor in turn, keeping in mind
# which coefficients these correspond to
# first test for genotype (1 term)
qlf.disease <- glmQLFTest(fit, coef=2:3)
qlf.tissue <- glmQLFTest(fit, coef=4)
qlf.inter <- glmQLFTest(fit, coef=5:6)
# do p-value adjustment
qlf.disease$table$QValue <- p.adjust(qlf.disease$table$PValue,method="BH")
qlf.tissue$table$QValue <- p.adjust(qlf.tissue$table$PValue,method="BH")
qlf.inter$table$QValue <- p.adjust(qlf.inter$table$PValue,method="BH")
# check number of significant genes
sum(qlf.disease$table$QValue<0.05)
## [1] 758
sum(qlf.tissue$table$QValue<0.05)
## [1] 13497
sum(qlf.inter$table$QValue<0.05)
## [1] 2102
We want to find genes affected by the disease, but then evaluate them separately depending on whether they behaved differently or similarly between the tissues. We will use the interaction term to sort based on different vs similar effect.
First let’s make a PCA plot.
# load the ggplot2, heatmap, and circlize packages if you have restarted R
library(ggplot2)
library(ComplexHeatmap)
library(circlize)
# generate log-scaled normalized expression
norm <- cpm(genes, log=T)
# run PCA and plot with respect to each factor
pca <- prcomp(t(norm))
pca.data <- cbind(pca$x, metadata, Sample=rownames(metadata))
ggplot(pca.data,aes(x=PC1,y=PC2,label=Sample,color=Disease)) +
geom_point() + geom_text(hjust=0, vjust=0, show.legend=F, color="black") +
coord_cartesian(clip='off')
ggplot(pca.data,aes(x=PC1,y=PC2,label=Sample,color=Tissue)) +
geom_point() + geom_text(hjust=0, vjust=0, show.legend=F, color="black") +
coord_cartesian(clip='off')
# Create combined PCA plot (Tissue as plot shape and Disease as color)
ggplot(pca.data,aes(x=PC1,y=PC2, label=Sample, color=Disease, shape=Tissue)) +
geom_point() + geom_text(hjust=0, vjust=0, show.legend=F, color="black") +
coord_cartesian(clip='off')
# check screeplot also
screeplot(pca)
# you can choose to add x and y labels with the % variance if you want
Above, we can see that the strongest effect (along PC1) is due to tissue. This matches the number of DEGs observed, and would be expected as tissues will generally have very different transcriptomic profiles.
We will determine a filtering strategy using the two-way ANOVA statistics, and make heatmaps from each option.
# make z-scored normalized expression
norm.z <- t(scale(t(norm)))
# make a Boolean selection vector for effects due to
# disease or interaction term
disease.sel <- qlf.disease$table$QValue < 0.05
inter.sel <- qlf.inter$table$QValue < 0.05
# genes with tissue-specific effects
# significant for disease AND interaction
different_effect <- norm.z[disease.sel & inter.sel,]
nrow(different_effect)
## [1] 310
# genes with disease effect, but no interaction
# significant for disease but NOT interaction
shared_effect <- norm.z[disease.sel & !inter.sel,]
nrow(shared_effect)
## [1] 448
# prepare color scales and heatmap annotations
col_fun <- colorRamp2(c(-2,0,2),c("blue","white","red"))
col_labels <- HeatmapAnnotation(df=metadata)
# make heatmaps of each of these
Heatmap(different_effect,col=col_fun,name="Z-scored log2 CPM",
top_annotation=col_labels, show_row_names=F, show_column_names=F)
Heatmap(shared_effect,col=col_fun,name="Z-scored log2 CPM",
top_annotation=col_labels, show_row_names=F, show_column_names=F)
These look mostly the same, and they don’t show any differences based on disease! That’s because the effect size, and in turn the color scale, is dominated by the tissue differences.
What we really want to see are the disease effects in each tissue separately. We can fix this by z-scoring separately for each tissue.
# make separate norm tables for each tissue
norm.drg <- norm[, tissue == "DRG"]
norm.sciatic <- norm[, tissue == "Sciatic_nerve"]
# z-score each
norm.drg.z <- t(scale(t(norm.drg)))
norm.sciatic.z <- t(scale(t(norm.sciatic)))
# combine them back together
norm.z <- cbind(norm.drg.z,norm.sciatic.z)
# reorder the columns to match the original sample order
norm.z <- norm.z[, colnames(norm)]
# now re-filter for our effects and make the heatmaps
# don't be afraid to copy-paste from above
different_effect <- norm.z[disease.sel & inter.sel,]
shared_effect <- norm.z[disease.sel & !inter.sel,]
# add an extra parameter to make side-by-side heatmaps for the tissues
# we can give column_split a vector of categories to split on
Heatmap(different_effect,col=col_fun,name="Z-scored log2 CPM",
top_annotation=col_labels, show_row_names=F, show_column_names=F,
column_split = tissue)
Heatmap(shared_effect,col=col_fun,name="Z-scored log2 CPM",
top_annotation=col_labels, show_row_names=F, show_column_names=F,
column_split = tissue)
The “different effect” heatmap shows a variety of patterns, such as genes that are down-regulated in peak disease in sciatic nerve, followed by an increase to normal in recovery; these same genes increase in the DRG in peak disease, and remain elevated in recovery.
In the “shared effect” heatmap, we see a few different patterns, but they are shared across tissues.
These are some examples of how we would use different filtering criteria and/or normalization strategies to address different questions.
ABOUT: These data are from a 16S amplicon experiment where the same cohort of animals was subject to one of three treatments (A, B, or C) over a 5-day time course. Samples were collected before treatmeant (Pre), and after 2, 3, and 5 days. The counts table are taxonomic summaries at a genus level. We’ll run a repeated measures test controlling for animal-to-animal differences and testing for differences between time points and treatments.
# read in counts
## include sep="\t" to tab-separate: taxa names have spaces in them
data <- read.table("https://wd.cri.uic.edu/edgeR/16S_counts.txt",
sep="\t",header=T,row.names=1)
dim(data)
## [1] 218 60
head(data)
S001_1_107 S002_2_108
Bacteria;Verrucomicrobiota;Verrucomicrobiae;Verrucomicrobiales;Akkermansiaceae;Akkermansia 5414 6918
Bacteria;Proteobacteria;Gammaproteobacteria;Pseudomonadales;Moraxellaceae;Acinetobacter 1538 0
Bacteria;Firmicutes;Clostridia;Oscillospirales;Ruminococcaceae;Ruminococcus 9123 5877
Bacteria;Firmicutes;Clostridia;Oscillospirales;Oscillospiraceae;NK4A214 group 3696 3613
Bacteria;Bacteroidota;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides 5298 7852
Bacteria;Bacteroidota;Bacteroidia;Bacteroidales;Rikenellaceae;Alistipes 592 1587
# read in metadata table
metadata <- read.table("https://wd.cri.uic.edu/edgeR/16S_metadata.txt",
header=T,row.names=1)
head(metadata)
# check distribution of animal IDs over time points
## see which animals were subjected to different treatments
## note that some treatments are missing a couple animals
table(metadata)
## AnimalID
## Group 107 108 109 110 111 113 114 115 116 117 118 120 121 122 123
## Day2.A 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
## Day2.B 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0
## Day2.C 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
## Day3.A 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
## Day3.B 0 0 0 0 0 1 1 0 1 1 1 0 0 0 0
## Day3.C 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
## Day5.A 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
## Day5.B 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0
## Day5.C 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0
## Pre 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# make sure the metadata row order matches the data column order
nrow(metadata)
## [1] 55
ncol(data)
## [1] 60
## NOTE: sample numbers are different: check what's different
## missing samples are:
## - 3 samples from animal 119 -- did not have a Pre sample, excluded from metadata
## - 2 "NC" samples: negative controls -- we want to exclude from analysis
colnames(data)[!colnames(data) %in% rownames(metadata)]
## [1] "S016_NC1" "S028_27_119" "S043_42_119" "S048_NC2" "S058_56_119"
# subset the data to match
data <- data[,rownames(metadata)]
dim(data)
## [1] 218 55
# subset counts to taxa with >500 total counts
## we have 218 samples, so this is an average of 2.3 counts per sample
## we are also most interested in the more abundant taxa
## lower abundant taxa are more likely to be data processing artifacts
data_subset <- data[rowSums(data)>500,]
dim(data_subset)
## [1] 121 55
# define factors and model matrix
treat <- factor(metadata[,1])
subject <- factor(metadata[,2])
model <- model.matrix( ~ 0 + treat + subject )
# create edgeR object and fit the model
taxa <- DGEList(counts=data_subset)
taxa <- calcNormFactors(taxa)
taxa <- estimateDisp(taxa, model)
sqrt(taxa$common.dispersion)
## [1] 0.7809027
fit <- glmQLFit(taxa, model)
# run contrasts
## look at coefficient names in the model
head(model)
## treatDay2.A treatDay2.B treatDay2.C treatDay3.A treatDay3.B treatDay3.C
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## treatDay5.A treatDay5.B treatDay5.C treatPre subject108 subject109 subject110
## 1 0 0 0 1 0 0 0
## 2 0 0 0 1 1 0 0
## 3 0 0 0 1 0 1 0
## 4 0 0 0 1 0 0 1
## 5 0 0 0 1 0 0 0
## 6 0 0 0 1 0 0 0
## subject111 subject113 subject114 subject115 subject116 subject117 subject118
## 1 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0
## 5 1 0 0 0 0 0 0
## 6 0 1 0 0 0 0 0
## subject120 subject121 subject122 subject123
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
# run a contrast:
## Day5A vs Pre
contrast <- makeContrasts( treatDay5.A - treatPre, levels = model )
qlf <- glmQLFTest( fit, contrast=contrast )
qlf$table$QValue <- p.adjust(qlf$table$PValue,method="BH")
# computed normalized values adjusted for repeated measures effects
## we'll cover batch effect removal again in the next exercise
taxa.norm <- cpm(taxa, log=T)
taxa.norm <- removeBatchEffect(taxa.norm, subject)
# heatmap of these taxa
taxa.signif <- rownames(qlf$table)[qlf$table$QValue < 0.05]
length(taxa.signif)
## [1] 13
norm.signif <- taxa.norm[taxa.signif,]
norm.signif <- t(scale(t(norm.signif)))
Heatmap(norm.signif,col=col_fun,name="Z-scored log2 CPM")
# clean up: make taxa names shorter
## first remove "Other" taxa at the end
## (the "$" anchors the search to the end of the string)
## then remove all higher-level taxa names
rownames(norm.signif) <- gsub(";Other$","",rownames(norm.signif))
rownames(norm.signif) <- gsub(".*;","",rownames(norm.signif))
Heatmap(norm.signif,col=col_fun,name="Z-scored log2 CPM")
# add color bar on top with group names
group.labels <- HeatmapAnnotation(Group=treat)
Heatmap(norm.signif,col=col_fun, name="Z-scored log2 CPM",
top_annotation = group.labels)
# force the groups to be together
Heatmap(norm.signif,col=col_fun, name="Z-scored log2 CPM",
top_annotation = group.labels, column_split = treat)
# only plot group 5A vs pre
contrast.sel <- treat == "Day5.A" | treat == "Pre"
norm.signif.subset <- norm.signif[, contrast.sel]
subset.labels <- HeatmapAnnotation(Group=treat[contrast.sel])
Heatmap(norm.signif.subset,col=col_fun, name="Z-scored log2 CPM",
top_annotation = subset.labels)
Exercise to try at home: there are a LOT more contrasts we could run on these data. Can you figure out how to do Day3A vs Pre? How about the average of Day2 vs Pre (how would you do an average of multiple groups in a contrast)?
ABOUT: These data are from an RNA-seq experiment comparing B cells from five mouse genotypes, with three replicates per group. However, the replicates were collected at different times, so there is a batch effect present, especially for replicate 1.
# reload the edgeR library in case you closed R studio
library(edgeR)
# read in counts
data <- read.table("https://wd.cri.uic.edu/edgeR/batch_counts.txt",
header=T,row.names=1)
dim(data)
## [1] 23366 15
head(data)
# read in metadata table
metadata <- read.table("https://wd.cri.uic.edu/edgeR/batch_metadata.txt",
header=T,row.names=1)
metadata
# make sure the metadata row order matches the data column order
data <- data[,rownames(metadata)]
# check the data again to make sure we didn't drop any columns!
dim(data)
## [1] 23366 15
# subset counts to genes with >20 total counts
data_subset <- data[rowSums(data)>20,]
dim(data_subset)
## [1] 14037 15
# define our factors
geno <- factor(metadata[,1])
batch <- factor(metadata[,2])
First, let’s try the analysis without the batch correction
# just model the genotype
model <- model.matrix(~ geno)
model
## (Intercept) genoB genoC genoD genoE
## 1 1 0 0 0 0
## 2 1 0 0 0 0
## 3 1 0 0 0 0
## 4 1 1 0 0 0
## 5 1 1 0 0 0
## 6 1 1 0 0 0
## 7 1 0 1 0 0
## 8 1 0 1 0 0
## 9 1 0 1 0 0
## 10 1 0 0 1 0
## 11 1 0 0 1 0
## 12 1 0 0 1 0
## 13 1 0 0 0 1
## 14 1 0 0 0 1
## 15 1 0 0 0 1
## attr(,"assign")
## [1] 0 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$geno
## [1] "contr.treatment"
# create edgeR object
genes <- DGEList(counts=data_subset)
# calculate TMM factors
genes <- calcNormFactors(genes)
# estimate dispersion
genes <- estimateDisp(genes,model)
# calculate BCV from the common dispersion
sqrt(genes$common.dispersion)
## [1] 0.3957431
# fit the model
fit <- glmQLFit(genes,model)
# run ANOVA: test all coefficients other than the intercept
qlf <- glmQLFTest(fit, coef=2:5)
# do p-value adjustment
qlf$table$QValue <- p.adjust(qlf$table$PValue,method="BH")
# check number of significant genes
sum(qlf$table$QValue)
## [1] 7020.937
# generate log-scaled normalized expression
norm <- cpm(genes, log=T)
# make colors for each factor for plotting purposes
# we're going to use the trick again by renaming the factor levels
# genotype colors
geno.colors <- geno
levels(geno.colors) = rainbow(length(levels(geno.colors)))
# run PCA
pca <- prcomp(t(norm))
pca.data <- cbind(pca$x, metadata, Sample=rownames(metadata))
ggplot(pca.data,aes(x=PC1,y=PC2)) +
geom_point(aes(color=Genotype)) +
geom_text(aes(label=Sample), hjust=0, vjust=0, show.legend=F) +
coord_cartesian(clip='off')
ggplot(pca.data,aes(x=PC1,y=PC2)) +
geom_point(aes(color=as.factor(Batch))) +
geom_text(aes(label=Sample), hjust=0, vjust=0, show.legend=F) +
coord_cartesian(clip='off')
ggplot(pca.data,aes(x=PC1,y=PC2)) +
geom_point(aes(shape=as.factor(Batch), color=Genotype)) +
geom_text(aes(label=Sample), hjust=0, vjust=0, show.legend=F) +
coord_cartesian(clip='off')
Note that replicate 1 samples are to the left, and other samples are to the right. This separation is along PC1, which is capturing the biggest portion of the variance. So, there is a batch effect, and it’s pretty strong.
Now, with the batch effect correction
# new model matrix with batch factor; no interaction term
batch.model <- model.matrix(~ geno + batch)
batch.model
## (Intercept) genoB genoC genoD genoE batch2 batch3
## 1 1 0 0 0 0 0 0
## 2 1 0 0 0 0 1 0
## 3 1 0 0 0 0 0 1
## 4 1 1 0 0 0 0 0
## 5 1 1 0 0 0 1 0
## 6 1 1 0 0 0 0 1
## 7 1 0 1 0 0 0 0
## 8 1 0 1 0 0 1 0
## 9 1 0 1 0 0 0 1
## 10 1 0 0 1 0 0 0
## 11 1 0 0 1 0 1 0
## 12 1 0 0 1 0 0 1
## 13 1 0 0 0 1 0 0
## 14 1 0 0 0 1 1 0
## 15 1 0 0 0 1 0 1
## attr(,"assign")
## [1] 0 1 1 1 1 2 2
## attr(,"contrasts")
## attr(,"contrasts")$geno
## [1] "contr.treatment"
##
## attr(,"contrasts")$batch
## [1] "contr.treatment"
# create edgeR object
batch.genes <- DGEList(counts=data_subset)
# calculate TMM factors
batch.genes <- calcNormFactors(batch.genes)
# estimate dispersion
batch.genes <- estimateDisp(batch.genes,batch.model)
# calculate BCV from the common dispersion
# should be much smaller!
sqrt(batch.genes$common.dispersion)
## [1] 0.2762293
# fit the model
batch.fit <- glmQLFit(batch.genes,batch.model)
# test again for the genotype effect - this test looks the same as before
# the batch terms are just "along for the ride"
batch.qlf <- glmQLFTest(batch.fit, coef=2:5)
# do p-value adjustment
batch.qlf$table$QValue <- p.adjust(batch.qlf$table$PValue,method="BH")
# check number of significant genes
# should be many more!
sum(batch.qlf$table$QValue)
## [1] 4621.895
# correct for batch effect: we just need to give the batch factor vector
# we can start with the same normalized expression as above
batch.norm <- removeBatchEffect(norm, batch)
# run PCA
batch.pca <- prcomp(t(batch.norm))
batch.pca.data <- cbind(batch.pca$x, metadata, Sample=rownames(metadata))
# this looks better, separation based on batch is gone
ggplot(batch.pca.data,aes(x=PC1,y=PC2)) +
geom_point(aes(color=Genotype)) +
geom_text(aes(label=Sample), hjust=0, vjust=0, show.legend=F) +
coord_cartesian(clip='off')
ggplot(batch.pca.data,aes(x=PC1,y=PC2)) +
geom_point(aes(color=as.factor(Batch))) +
geom_text(aes(label=Sample), hjust=0, vjust=0, show.legend=F) +
coord_cartesian(clip='off')
ABOUT: This is a clinical data set (46 samples), where some patients had a disease and others did not.
# read in counts
data <- read.table("https://wd.cri.uic.edu/edgeR/cont_counts.txt",
header=T,row.names=1)
dim(data)
## [1] 57230 46
# read in metadata table
metadata <- read.table("https://wd.cri.uic.edu/edgeR/cont_metadata.txt",
header=T,row.names=1)
head(metadata)
# make sure the metadata row order matches the data column order
data <- data[,rownames(metadata)]
# check the data again to make sure we didn't drop any columns!
dim(data)
## [1] 57230 46
# subset counts to genes with >20 total counts
data_subset <- data[rowSums(data)>20,]
dim(data_subset)
## [1] 19827 46
# first let's take a look at the scores a bit more
# check a histogram of the scores
hist(metadata$Phenotype_Score)
# check if there's an association between score and disease
boxplot(Phenotype_Score ~ Disease, data=metadata)
kruskal.test(Phenotype_Score ~ Disease, data=metadata)
##
## Kruskal-Wallis rank sum test
##
## data: Phenotype_Score by Disease
## Kruskal-Wallis chi-squared = 0.27836, df = 1, p-value = 0.5978
This looks OK: if there was a strong association (small p-value), then the variables would be confounded and we couldn’t test them independently. So we can continue:
# define our variable/factor and model matrix
# use as.numeric for the score
score <- as.numeric(metadata[,1])
disease <- factor(metadata[,2])
model <- model.matrix(~ score + disease + score:disease)
head(model)
## (Intercept) score diseaseTRUE score:diseaseTRUE
## 1 1 82 0 0
## 2 1 90 0 0
## 3 1 65 1 65
## 4 1 90 1 90
## 5 1 49 1 49
## 6 1 43 1 43
# create edgeR object
genes <- DGEList(counts=data_subset)
# calculate TMM factors
genes <- calcNormFactors(genes)
# estimate dispersion
# this may take a minute - there are a lot of samples!
genes <- estimateDisp(genes,model)
# calculate BCV from the common dispersion
# note the BCV is much higher in this data set
# clinical data sets are often highly variable!
sqrt(genes$common.dispersion)
## [1] 2.027726
plotBCV(genes)
# fit the model
fit <- glmQLFit(genes,model)
# now we test each variable/factor in turn
# the continuous variable is always just 1 term
qlf.score <- glmQLFTest(fit, coef=2)
qlf.disease <- glmQLFTest(fit, coef=3)
qlf.inter <- glmQLFTest(fit, coef=4)
# do p-value adjustment
qlf.score$table$QValue <- p.adjust(qlf.score$table$PValue,method="BH")
qlf.disease$table$QValue <- p.adjust(qlf.disease$table$PValue,method="BH")
qlf.inter$table$QValue <- p.adjust(qlf.inter$table$PValue,method="BH")
# check number of significant genes due to:
# score
sum(qlf.score$table$QValue<0.05)
## [1] 1964
# disease
sum(qlf.disease$table$QValue<0.05)
## [1] 1114
# interaction term
sum(qlf.inter$table$QValue<0.05)
## [1] 847
# read in counts
data <- read.table("https://wd.cri.uic.edu/edgeR/norep_counts.txt",
header=T,row.names=1)
dim(data)
## [1] 24421 2
# read in metadata table
metadata <- read.table("https://wd.cri.uic.edu/edgeR/norep_metadata.txt",
header=T,row.names=1)
metadata
# make sure the metadata row order matches the data column order
data <- data[,rownames(metadata)]
# check the data again to make sure we didn't drop any columns!
dim(data)
## [1] 24421 2
# subset counts to genes with >20 total counts
data_subset <- data[rowSums(data)>20,]
dim(data_subset)
## [1] 14947 2
# create edgeR object
genes <- DGEList(counts=data_subset, group=metadata[,1])
# calculate TMM factors
genes <- calcNormFactors(genes)
# instead of estimating dispersion, we'll just set it
# let's assume a BCV of 0.2 (20%)
bcv <- 0.2
# run differential, specifying the dispersion to use
stats <- exactTest(genes, dispersion=bcv^2)
# run FDR correction
stats$table$QValue <- p.adjust(stats$table$PValue,method="BH")
# sort by p-value to prioritize top genes
head(stats$table[order(stats$table$PValue),])
This uses data associated with the one-way ANOVA (Exercises 1.5 and 1.6). But we will run pairwise comparisons between all pairs of the factor levels.
# read in, reconicle, and filter data as usual
data <- read.table("https://wd.cri.uic.edu/edgeR/anova_counts.txt",
header=T,row.names=1)
metadata <- read.table("https://wd.cri.uic.edu/edgeR/anova_metadata.txt",
header=T,row.names=1)
data <- data[,rownames(metadata)]
data_subset <- data[rowSums(data)>20,]
dim(data_subset)
## [1] 16713 12
treat <- factor(metadata[,1])
# start a new data frame to store the results in
pw_stats <- data.frame(Gene = rownames(data_subset))
# get a list of the levels, and the number
levs <- levels(treat)
nlevs <- length(levs)
# loop over all pairs of levels
for(i in 1:(nlevs-1)){
for(j in (i+1):nlevs){
# names for these groups
A <- levs[i]
B <- levs[j]
# get the logical vectors for these groups
groupA <- A == treat
groupB <- B == treat
# subset the data for groupA or groupB data
pw_metadata <- treat[as.logical(groupA+groupB)]
pw_counts <- data_subset[,as.logical(groupA+groupB)]
# run our edgeR commands
pw_genes <- DGEList(counts=pw_counts, group=pw_metadata)
pw_genes <- calcNormFactors(pw_genes)
pw_genes <- estimateDisp(pw_genes)
pw_test <- exactTest(pw_genes)
pw_test$table$QValue <- p.adjust(pw_test$table$PValue,method="BH")
# add the name of this comparison to the column names
# edgeR does the comparisons as groupB / groupA, so
# we list groupB first
pw_name <- paste(B, A, sep="/")
colnames(pw_test$table) = paste(pw_name, ":", colnames(pw_test$table))
# add results to the pairwise list
pw_stats <- cbind(pw_stats, pw_test$table)
}
}
## Using classic mode.
## Using classic mode.
## Using classic mode.
head(pw_stats)
NOTE: you may find it useful to remake the above code as a function, which will allow you to rerun it again more easily on a new data set. For example:
pw_tests <- function( counts, groups ){
# start a new data frame to store the results in
pw_stats <- data.frame(Gene = rownames(counts))
# get a list of the levels, and the number
levs <- levels(groups)
nlevs <- length(levs)
# loop over all pairs of levels
for (i in 1:(nlevs - 1)) {
for (j in (i + 1):nlevs) {
# names for these groups
A <- levs[i]
B <- levs[j]
# get the logical vectors for these groups
groupA <- A == groups
groupB <- B == groups
# subset the data for groupA or groupB data
pw_metadata <- treat[as.logical(groupA + groupB)]
pw_counts <- data_subset[, as.logical(groupA + groupB)]
# run our edgeR commands
pw_genes <- DGEList(counts = pw_counts, group = pw_metadata)
pw_genes <- calcNormFactors(pw_genes)
pw_genes <- estimateDisp(pw_genes)
pw_test <- exactTest(pw_genes)
pw_test$table$QValue <- p.adjust(pw_test$table$PValue, method = "BH")
# add the name of this comparison to the column names
# edgeR does the comparisons as groupB / groupA, so
# we list groupB first
pw_name <- paste(B, A, sep = "/")
colnames(pw_test$table) = paste(pw_name, ":", colnames(pw_test$table))
# add results to the pairwise list
pw_stats <- cbind(pw_stats, pw_test$table)
}
}
return(pw_stats)
}
# to run the function:
all_pairwise <- pw_tests( data_subset, treat )
## Using classic mode.
## Using classic mode.
## Using classic mode.
head(all_pairwise)
Use the biomaRt package in from bioconductor. For first time use we need to install it.
BiocManager::install("biomaRt")
library(biomaRt)
data <- read.table("https://wd.cri.uic.edu/edgeR/mouse_ensembl_rnaseq.txt",
header=T,row.names=1)
head(data)
Note that the identifiers are:
ENS).MUS). If no three-letter code is shown, then
they’re human IDs.G)..[number]). Will need to
remove versions before looking up symbols.# get rid of version numbers in transcript names
# this is is also necessary if you're going to upload to a pathway database
# this is an example of REGULAR EXPRESSIONS, which we've seen a little
# of also in grep and sed commands in linux. Very powerful!
head(rownames(data))
## [1] "ENSMUSG00000051951.5" "ENSMUSG00000102851.1" "ENSMUSG00000103147.1"
## [4] "ENSMUSG00000102331.1" "ENSMUSG00000102343.1" "ENSMUSG00000025900.12"
newnames <- gsub("\\.[0-9]*","",rownames(data))
head(newnames)
## [1] "ENSMUSG00000051951" "ENSMUSG00000102851" "ENSMUSG00000103147"
## [4] "ENSMUSG00000102331" "ENSMUSG00000102343" "ENSMUSG00000025900"
Now convert the IDs to gene symbols. Check the manual for the
biomaRt package for more details. In particular:
For today, we already know the database
(mmusculus_gene_ensembl) and attributes we want
(ensembl_gene_id and gene symbol AKA
external_gene_name). Note that both of these commands may
take some time to run, since they are fetching data from an online
database hosted by Ensembl.
# get the database of mouse gene annotations
mart <- useDataset("mmusculus_gene_ensembl",useMart("ensembl"))
gene_list <- getBM(filters="ensembl_gene_id",
attributes=c("ensembl_gene_id","external_gene_name"),
values=newnames, mart=mart)
head(gene_list)
# note that not ALL IDs may get matched
length(newnames)
## [1] 39056
nrow(gene_list)
## [1] 38535
# also note that gene symbols are NOT unique relative to the gene IDs
length(unique(gene_list[,1]))
## [1] 38535
length(unique(gene_list[,2]))
## [1] 38492
Now that you have the list, you can combine with an existing data
frame. In the example below, we combine with the counts table using
merge().
NOTE: usually you would want to keep the original IDs as the main identifiers throughout your analysis in edgeR, and then merge the symbols or other information with your normalized expression (CPM) or differential stats tables AFTER running the analysis. You can’t have a mix of symbols and other strings in your counts table when processing through edgeR.
# replace the row names with the IDs after stripping off the version numbers
rownames(data) = newnames
# merge based on ensembl_gene_id and rownames
# keep all entries. unmatched IDs will have NA values for the gene symbol
data.named <- merge(data, gene_list,
by.x="row.names", by.y="ensembl_gene_id", all=T)
head(data.named)
(Start from the end of Exercise 2.2 to lead into this exercise.)
There’s another way we could consider filtering for genes with a significant interaction: just filter directly on the interaction term, ignoring the independent effects of disease. That is:
any_inter <- norm.z[inter.sel,]
Or, we could separate just the significant interactions from those that also have independent disease effects:
inter_only <- norm.z[inter.sel & !disease.sel,]
Try a heatmap with the second option:
Heatmap(inter_only,col=col_fun,name="Z-scored log2 CPM",
top_annotation=col_labels, show_row_names=F, show_column_names=F,
column_split = tissue)
It looks like we have a clear effect in sciatic nerve, but the
picture in DRG is murkier. Assuming you’re looking at
norm.z with values z-scored separately for each tissue,
it’s a little hard to assess the relative effect in DRG because we’ve
z-scored.
We will try running one-way ANOVA tests separately in each tissue.
samples.drg <- tissue=="DRG"
samples.sn <- tissue=="Sciatic_nerve"
data.drg <- data_subset[,samples.drg]
data.sn <- data_subset[,samples.sn]
disease.drg <- disease[samples.drg]
disease.sn <- disease[samples.sn]
model.drg <- model.matrix(~disease.drg)
model.sn <- model.matrix(~disease.sn)
genes.drg <- DGEList(data.drg)
genes.sn <- DGEList(data.sn)
genes.drg <- calcNormFactors(genes.drg)
genes.sn <- calcNormFactors(genes.sn)
genes.drg <- estimateDisp(genes.drg,model.drg)
genes.sn <- estimateDisp(genes.sn,model.sn)
fit.drg <- glmQLFit(genes.drg,model.drg)
fit.sn <- glmQLFit(genes.sn,model.sn)
test.drg <- glmQLFTest(fit.drg, coef=2:3)
test.sn <- glmQLFTest(fit.sn, coef=2:3)
test.drg$table$QValue <- p.adjust(test.drg$table$PValue,method="BH")
test.sn$table$QValue <- p.adjust(test.sn$table$PValue,method="BH")
# count the number of DEGs
sum(test.drg$table$QValue<0.05)
## [1] 921
sum(test.sn$table$QValue<0.05)
## [1] 3684
# which of these genes were detected before in the "disease" factor?
sum(test.drg$table$QValue<0.05 & disease.sel)
## [1] 548
sum(test.sn$table$QValue<0.05 & disease.sel)
## [1] 365
It does look like the general disease effect is stronger in sciatic nerve. Remake the heatmap, with a row annotation indicating the significance for each tissue separately.
# prepare annotations for the heatmap based on -log10 FDR
signif.drg <- -log10(test.drg$table$QValue)
signif.sn <- -log10(test.sn$table$QValue)
labels.drg <- signif.drg[!disease.sel & inter.sel]
labels.sn <- signif.sn[!disease.sel & inter.sel]
# set an alternate color scale for significance levels
col_signif <- colorRamp2(c(0,-log10(0.05),5),c("white","red","yellow"))
# use the same color scale for both annotations
anova.label <- rowAnnotation(DRG=labels.drg, SN=labels.sn,
col=list(DRG=col_signif, SN=col_signif))
Heatmap(inter_only,col=col_fun,name="Z-scored log2 CPM",
top_annotation=col_labels, show_row_names=F, show_column_names=F,
column_split = tissue, left_annotation = anova.label)
All of these show a significant effect in sciatic nerve, whereas only some of them show it in DRG.
For this exercise, we’ll use a shotgun metagenomics data set, with taxonomic quantification summarized at a phylum level. These are data obtained from a mouse testing a surgery injury model, and a treatment for the injury. We have two experimental factors: (1) Injury, with levels Naive, Sham, and Surgery; and (2) Treatment, with levels treated and control.
# load the edgeR and ggplot2 libraries
library(edgeR)
library(ggplot2)
# read in the counts data
data <- read.table("https://wd.cri.uic.edu/edgeR/metagenomics_counts.txt",
sep="\t",header=T,row.names=1)
# observe that the features are taxonomic names, down to phylum level
# all are for bacteria
head(rownames(data))
## [1] "sk_Bacteria;k__Bacteria incertae sedis;p__Caldiserica"
## [2] "sk_Bacteria;k__Bacteria incertae sedis;p__Candidatus Levybacteria"
## [3] "sk_Bacteria;k__Bacteria incertae sedis;p__Candidatus Magasanikbacteria"
## [4] "sk_Bacteria;k__Bacteria incertae sedis;p__Thermotogae"
## [5] "sk_Bacteria;k__Bacteria incertae sedis;p__Candidatus Azambacteria"
## [6] "sk_Bacteria;k__Bacteria incertae sedis;p__Thermodesulfobacteria"
# simplify taxonomic names to just the phylum
rownames(data) <- gsub("^.*;p__","",rownames(data))
head(rownames(data))
## [1] "Caldiserica" "Candidatus Levybacteria"
## [3] "Candidatus Magasanikbacteria" "Thermotogae"
## [5] "Candidatus Azambacteria" "Thermodesulfobacteria"
# also read in metadata
metadata <- read.table("https://wd.cri.uic.edu/edgeR/metagenomics_metadata.txt",
header=T,row.names=1)
# match metadata and data, filter the taxa
data <- data[,rownames(metadata)]
data_subset <- data[rowSums(data)>20,]
# run just a few steps in edgeR to get normalized abundance
# you can run differential stats also, but you should be an expert on that by now
# so we omit those steps here
taxa <- DGEList(counts = data_subset)
taxa <- calcNormFactors(taxa)
taxa.norm <- cpm(taxa,log=T)
# run PCA analysis and make a data frame with metadata
pca <- prcomp(t(taxa.norm))
pca.data <- cbind(pca$x, metadata)
# colored by Injury and Treatment
ggplot(pca.data,aes(x=PC1,y=PC2,color=Injury)) + geom_point()
ggplot(pca.data,aes(x=PC1,y=PC2,color=Treatment)) + geom_point()
# facet by Treatment
ggplot(pca.data,aes(x=PC1,y=PC2,color=Treatment)) + geom_point() +
facet_grid(~Injury)
# plot each factor, add confidence ellipse
ggplot(pca.data,aes(x=PC1,y=PC2,color=Treatment)) + geom_point() +
facet_grid(~Injury) + stat_ellipse()
The Firmicutes/Bacteroidetes ratio is a common measure of the health of a gut microbiome. We can add that value to the PCA plot as the dot colors.
# add counts for Bacteroidetes and Firmicutes
pca.data.expanded <- cbind(pca.data,
Bacteroidetes = t(data["Bacteroidetes",]),
Firmicutes = t(data["Firmicutes",]))
# color by Firmicutes/Bacteriodetes ratio
ggplot(pca.data.expanded,aes(x=PC1,y=PC2,color=Firmicutes/Bacteroidetes)) +
geom_point()
# color by the log2-ratio
ggplot(pca.data.expanded,aes(x=PC1,y=PC2,color=log2(Firmicutes/Bacteroidetes))) +
geom_point()
Make add a biplot, which allows us to visualize the contribution of each feature on the principle components. Note that you may want to be careful with this plot if you have a huge number of features, but with a relatively small number of taxa it will work.
# using the built-in biplot function
biplot(pca)
# the setup is more manual with ggplot2, but
# we have more control over the plotting aesthetics
# first, get the rotation matrix as a data frame
rotation <- data.frame(taxa=rownames(pca$rotation), pca$rotation)
# determine a scalar to multiply the vector lengths by
# so that the scales line up better
mult <- min(
(max(pca.data[,"PC2"]) - min(pca.data[,"PC2"])/(max(rotation[,"PC2"])-min(rotation[,"PC2"]))),
(max(pca.data[,"PC1"]) - min(pca.data[,"PC1"])/(max(rotation[,"PC1"])-min(rotation[,"PC1"])))
)
rotation$PC1 = mult * rotation$PC1
rotation$PC2 = mult * rotation$PC2
# start making the plot: first the main PCA scatterplot
plot <- ggplot(pca.data, aes(x=PC1, y=PC2,color=Treatment)) + geom_point()
# add vertical and horizontal lines for the middle of the biplot
plot <- plot + geom_hline(yintercept=0) + geom_vline(xintercept=0)
# add text labels for each taxa
# set clip="off" to let text labels go outside the plot area
plot <- plot + coord_equal(clip='off') +
geom_text(data=rotation, aes(x=PC1, y=PC2, label=taxa),
size=3, hjust=0, vjust=0, color="red")
# add arrows for the biplot: from the origin (0,0)
# to the PC coordinate in the rotation matrix
plot <- plot + geom_segment(data=rotation, aes(x=0, y=0, xend=PC1, yend=PC2),
arrow=arrow(length=unit(0.2,"cm")), color="red")
plot